import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import chart_studio.plotly as py
import plotly.graph_objects as go
import plotly.io as pio
from scipy import stats
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf
# Global plot appearance: ggplot theme plus the NanumGothic font
# (presumably chosen for Korean labels -- the font must be installed locally).
plt.style.use("ggplot")
plt.rcParams.update(
    {
        "font.family": "NanumGothic",
        # Draw minus signs with an ASCII hyphen instead of the Unicode minus.
        "axes.unicode_minus": False,
    }
)
# Load the accident records, drop the accident-number column, keep only rows
# where both age columns are positive, parse the accident date, and derive a
# decade label ("<decade>대") from acdnt_age_2_dc.
# NOTE(review): presumably non-positive ages encode "unknown" -- confirm.
data = (
    pd.read_json("data/accident-total-pp.json")
    .drop(columns="acdnt_no")
    .loc[lambda df: (df.acdnt_age_2_dc > 0) & (df.acdnt_age_1_dc > 0)]
    .assign(
        acdnt_dd_dc=lambda df: pd.to_datetime(df.acdnt_dd_dc, format="%Y-%m-%d"),
        victim_age_group=lambda df: (
            ((df.acdnt_age_2_dc // 10) * 10).astype(str) + "대"
        ),
    )
)
# Print a quick profile of every column (number of distinct values as reported
# by Series.unique(), and the first three values), then the frame shape.
for colname in data.columns:
    column = data[colname]
    # The banner text stays flush-left so the printed layout is unchanged.
    print(
        f"""
===================
Column: {colname}
Nunique: {column.unique().size}
Sample: {column.values[:3]}
==================
"""
    )
print(f"Data shape: {data.shape}")
# Bare expression: only rendered when it is the last statement of a notebook
# cell; it has no visible effect when the file runs as a script.
data.head()
# Distribution of the number of traffic accidents per day.
# Author's observation: both the kids and the non-kids daily counts look
# approximately normal.
plt.figure(figsize=(10, 5))
kids_daily_counts = data[data.kids_acdnt == 1].groupby("acdnt_dd_dc").size()
other_daily_counts = data[data.kids_acdnt != 1].groupby("acdnt_dd_dc").size()
for daily_counts in (kids_daily_counts, other_daily_counts):
    sns.distplot(daily_counts)
# Legend entries follow the plotting order above.
plt.legend(["Kids", "Non-Kids"])
# Monthly accident counts, one facet row per kids_acdnt value.
# Top: monthly non-kids traffic accidents.
# Bottom: monthly kids traffic accidents (ages 1-14); the author notes clear
# annual seasonality and a downward trend.
#
# Every date is truncated to the first day of its month. The cast names an
# explicit unit because the unit-less "datetime64" dtype is rejected by
# pandas >= 2.0, while "datetime64[ns]" works on old and new versions alike.
month_start = (data.acdnt_dd_dc.dt.strftime("%Y-%m") + "-01").astype(
    "datetime64[ns]"
)
ts_data = (
    data.groupby(["kids_acdnt", month_start])
    .size()
    .reset_index()
    .rename(columns={0: "cnt"})
)
sns.FacetGrid(
    ts_data,
    row="kids_acdnt",
    sharex=False,
    sharey=False,
    height=4,
    aspect=3,
).map(sns.lineplot, "acdnt_dd_dc", "cnt")
# Time-series decomposition: Trend + Seasonality + Residuals.
# Only the rows whose kids_acdnt flag is truthy are decomposed.
plt.rcParams["figure.figsize"] = (15, 10)
kids_monthly_counts = (
    ts_data.loc[ts_data.kids_acdnt.astype(bool)]
    .set_index("acdnt_dd_dc")["cnt"]
)
res = seasonal_decompose(kids_monthly_counts, model="additive")
res.plot()
plt.show()
# Residual diagnostics for the decomposition stored in `res`.
# The decomposition leaves NaN residuals (dropped here before any statistic
# is computed).
residual = res.resid.dropna()
sr = stats.zscore(residual)
# probplot returns ((theoretical quantiles, ordered values), fit parameters);
# only the point coordinates are needed for the scatter.
(x, y), _ = stats.probplot(sr)

# Q-Q plot
plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
plt.title("Normal Q-Q Plot")
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, and the keyword form also works on older releases.
sns.scatterplot(x=x, y=y)
# Reference diagonal: standardized residuals fall on it if they are normal.
plt.plot([-3, 3], [-3, 3], "--", color="grey")

# Density & histogram
plt.subplot(1, 2, 2)
plt.title("Density & Histogram")
sns.distplot(residual)

# Normality test
print(stats.normaltest(residual))

# Autocorrelation
plot_acf(residual)
plt.show()
# Accident counts per occrrnc_time_dc value for victims in the "0대"/"10대"
# age groups, plotted separately for rows with a truthy and a falsy kidszone
# flag.
# NOTE(review): occrrnc_time_dc is presumably the time-of-day bucket -- confirm.
kids = data[data.victim_age_group.isin(["0대", "10대"])]
in_kidszone = kids.kidszone.astype(bool)
kidszone = kids[in_kidszone]
nonkidszone = kids[~in_kidszone]

for title, subset in (("Kidszone", kidszone), ("Non-kidszone", nonkidszone)):
    # Name the index explicitly: pandas >= 2.0 no longer emits a column called
    # "index" from value_counts().reset_index(), so relying on that default
    # breaks the x="index" lookup below.
    time_counts = (
        subset.occrrnc_time_dc.value_counts()
        .rename_axis("index")
        .reset_index(name="cnt")
    )
    plt.figure(figsize=(10, 4))
    sns.pointplot(data=time_counts, x="index", y="cnt")
    plt.title(title)
# Mean number of accidents per (occrrnc_time_dc, victim_age_group), averaged
# over the dates that appear for that combination, reshaped to one column per
# age group. Dates without any matching accident are not part of the average
# because they never appear in the first groupby.
tmp = (
    data.groupby(["occrrnc_time_dc", "acdnt_dd_dc", "victim_age_group"])
    .size()
    .reset_index()
    .rename(columns={0: "cnt"})
    .groupby(["occrrnc_time_dc", "victim_age_group"])
    .cnt.mean()
    .rename("cnt")
    .reset_index()
    # Keyword arguments: positional pivot() arguments raise on pandas >= 2.0.
    .pivot(index="occrrnc_time_dc", columns="victim_age_group", values="cnt")
)

# One line per age group; the "0대" and "10대" groups are emphasized with a
# thicker, fully opaque line while the others are faded.
fig = go.Figure()
for age in tmp.columns:
    if age in ["0대", "10대"]:
        width, opacity = 2, 1
    else:
        width, opacity = 1, 0.3
    fig.add_trace(
        go.Scatter(
            x=tmp.index.values,
            y=tmp[age].values,
            mode="lines+markers",
            name=age,
            line=dict(width=width),
            opacity=opacity,
        )
    )
# Bare expression: renders the figure only as the last statement of a
# notebook cell.
fig
# For kids accidents, show how each low-cardinality column is distributed
# within every "sido" (the first whitespace-separated token of legaldong_name).
kids = data[data.kids_acdnt == 1].drop("kids_acdnt", axis=1)
kids = kids.assign(sido=[x.split()[0] for x in kids.legaldong_name])

# Iterate in column order (not over a set) so the sequence of figures is the
# same on every run.
for colname in [c for c in kids.columns if c != "sido"]:
    uniques = kids[colname].unique()
    # Only columns with fewer than 30 distinct values are plotted as hues.
    if uniques.size >= 30:
        continue
    result = (
        kids.groupby(["sido", colname, "acdnt_dd_dc"])
        .size()
        .rename("cnt")
        .reset_index()
        .groupby(["sido", colname])
        .cnt.sum()
        .reset_index()
    ).fillna(0)
    # Share of each value within its sido. transform() keeps the "sido" column
    # and the row order without relying on groupby.apply acting on the grouping
    # column, which is deprecated in pandas 2.2.
    result = result.assign(
        proportion=result.cnt / result.groupby("sido").cnt.transform("sum")
    )
    plt.figure(figsize=(15, 5))
    sns.barplot(
        data=result,
        y="proportion",
        x="sido",
        hue=colname,
    )
    plt.show()
# For every low-cardinality column, print a count table and draw side-by-side
# horizontal bar charts split by the kids_acdnt flag.
# Iterate in column order (not over a set) so the output order is the same on
# every run.
for colname in [c for c in data.columns if c != "kids_acdnt"]:
    uniques = data[colname].unique()
    # Only columns with fewer than 30 distinct values are summarized.
    if uniques.size >= 30:
        continue
    result = (
        data.groupby(["kids_acdnt", colname, "acdnt_dd_dc"])
        .size()
        .reset_index()
        .rename(columns={0: "cnt"})
        .groupby(["kids_acdnt", colname])
        .cnt.sum()
        .reset_index()
    )
    # Share of each value within its kids_acdnt group. transform() keeps the
    # "kids_acdnt" column without relying on groupby.apply acting on the
    # grouping column, which is deprecated in pandas 2.2.
    result = result.assign(
        proportion=result.cnt / result.groupby("kids_acdnt").cnt.transform("sum")
    )
    print("========================")
    # Keyword arguments: positional pivot() arguments raise on pandas >= 2.0.
    print(result.pivot(index=colname, columns="kids_acdnt", values="cnt"))
    print("========================")
    # `order=uniques` fixes the bar order to the order of first appearance in
    # the data, identically in both facets.
    sns.FacetGrid(
        result,
        col="kids_acdnt",
        sharex=False,
        sharey=False,
        height=5,
        aspect=1,
    ).map(sns.barplot, "cnt", colname, orient="h", order=uniques)
    plt.show()